In [1]:
import requests
from bs4 import BeautifulSoup
from mechanize import Browser
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time

Goal: get TripAdvisor reviews for top Seattle attractions

1. Scrape data from page using requests and BeautifulSoup


In [2]:
# Example page this was developed against:
# url = 'https://www.tripadvisor.com/Attraction_Review-g60878-d3184389-Reviews-Chihuly_Garden_and_Glass-Seattle_Washington.html#REVIEWS'

def get_reviews(response):
    """Extract review texts from a TripAdvisor attraction page.

    Parameters
    ----------
    response : str or file-like
        Raw HTML of the page (anything BeautifulSoup accepts).

    Returns
    -------
    list of str
        The text of each ``<div class="entry">`` on the page, with
        newlines stripped out.
    """
    soup = BeautifulSoup(response, 'html.parser')
    # find_all is the current bs4 spelling; findAll is a deprecated alias.
    entries = soup.find_all('div', {'class': 'entry'})
    return [entry.text.replace('\n', '') for entry in entries]

2. Use mechanize to get reviews for all of the top attractions


In [3]:
def mechanize_reviews(url):
    """Open ``url`` with mechanize and scrape reviews from its links.

    Follows each link whose text/URL contains 'Attraction_Review' and
    returns the first batch of more than 10 reviews found.  If no link
    yields more than 10 reviews, returns the last batch scraped
    (possibly an empty list).

    Parameters
    ----------
    url : str
        Page whose links are searched for attraction review pages.

    Returns
    -------
    list of str
        Review texts as produced by ``get_reviews``.
    """
    br = Browser()  # Initialize browser object
    br.set_handle_robots(False)  # avoid a 'disallowed by robots.txt' error
    # br.addheaders = [('User-agent', 'Firefox')]  # sometimes you need this line
    br.open(url)  # Retrieve the requested page
    # NOTE: the original br.select_form(nr=0) was removed — its result was
    # never used, and it raises mechanize.FormNotFoundError on pages
    # that contain no forms.
    reviews = []
    for link in br.links():
        # Only follow links that point at an attraction's review page.
        if 'Attraction_Review' in str(link):
            data = br.follow_link(link)
            reviews = get_reviews(data)
            if len(reviews) > 10:
                return reviews
    return reviews

Get reviews for top attractions in multiple cities


In [5]:
url = 'https://www.tripadvisor.com'
places = ['Portland, OR', 'San Francisco, CA', 'Seattle, WA']
# TODO: hardcoded absolute path — make this configurable per machine.
chromedriver = '/Users/sydneydecoto/bin/chromedriver'

# place -> list of review strings.  Previously each iteration overwrote
# `reviews` and the result was discarded when the loop advanced.
all_reviews = {}
for place in places:
    # Initialize a chrome driver and go to url
    driver = webdriver.Chrome(chromedriver)
    try:
        driver.get(url)
        # wait for page to load, time out after 10 seconds
        searchbox = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'GEO_SCOPED_SEARCH_INPUT')))
        searchbox.send_keys(place)

        mainsearch = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'mainSearch')))
        mainsearch.send_keys('Things to Do')
        driver.find_elements_by_class_name('inner')[0].click()
        driver.switch_to_alert()  # ignore the popup
        all_reviews[place] = mechanize_reviews(driver.current_url)
    finally:
        # Quit even when a wait times out or a click fails, so
        # chromedriver processes don't leak across iterations.
        driver.quit()

Finding forms with mechanize


In [17]:
br = Browser()  # Initialize browser object
br.set_handle_robots(False)  # try this if you get a 'disallowed by robots.txt' error
br.addheaders = [('User-agent', 'Firefox')]  # sometimes you need this line

url = 'https://seattle.craigslist.org/'
br.open(url)
# List every form on the page so we know which one to select by index.
for form in br.forms():
    # print() runs on both Python 2 and 3; the bare `print form`
    # statement is Python-2-only syntax.
    print(form)


<POST https://seattle.craigslist.org/favorites application/x-www-form-urlencoded
  <HiddenControl(lastLink=) (readonly)>
  <HiddenControl(lastTitle=) (readonly)>
  <HiddenControl(fl=) (readonly)>
  <HiddenControl(uf=1) (readonly)>>
<GET https://seattle.craigslist.org/search/ application/x-www-form-urlencoded
  <HiddenControl(sort=rel) (readonly)>
  <TextControl(query=)>
  <SelectControl(catAbb=[ccc, eee, ggg, hhh, jjj, ppp, res, *sss, bbb])>
  <SubmitButtonControl(<None>=) (readonly)>>

In [ ]: